
# coding: utf-8

# In[15]:

import pandas as pd
import os


# In[16]:

# Every file name used below is relative, so switch into the data folder first.
work_dir = "D:/KS/KRpatent/matching/uspto_family"
os.chdir(work_dir)


# In[17]:

# Load the three comma-separated input tables from the working directory.
US, family, assg = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ('assgid.csv', 'family_temp.csv', 'assg_weight.csv')
)


# In[18]:

# Chain two inner joins: US with family on 'wku', then the result with assg on
# 'appnum'.  Inner joins keep only rows whose key is present on both sides.
US_family = US.merge(family, how='inner', on='wku')
US_family_assg = US_family.merge(assg, how='inner', on='appnum')

# Distinct assgid values that survived both joins (set order, not sorted).
id_list = list(set(US_family_assg['assgid'].tolist()))

# Notebook peek: number of joined rows.
len(US_family_assg)


# In[19]:

# Notebook peek: first 20 collected ids.
id_list[:20]


# In[20]:

# Relative share of every row inside its assignee:
#     rs = weight / (sum of weight over all rows with the same assgid)
total = US_family_assg[['assgid', 'weight']].groupby('assgid').sum()
total = total.rename(columns={'weight': 'total_wgt'})
US_family_assg = pd.merge(US_family_assg, total, on='assgid', how='left')
US_family_assg['rs'] = US_family_assg['weight'] / US_family_assg['total_wgt']

# From here on US_family_assg is a GroupBy object keyed by assgid.
# Group by the bare column name rather than a one-element list: recent pandas
# versions deprecate scalar get_group() lookups on a grouper that was built
# from a length-1 list, while a scalar key works on every version.
US_family_assg = US_family_assg.groupby('assgid')

# Release the intermediate tables; only the grouped frame and id_list are
# still referenced by this block's outputs.
del total, US_family, US, family, assg


# In[21]:

# For every assgid pick the single kiprisid with the largest summed share
# ("rs").  An assgid whose two best candidates have exactly the same share is
# ambiguous and is left out of `match`.
# match maps assgid -> [best kiprisid, its summed rs].
match = {}

for assg_id in id_list:  # not named `id`, which would shadow the builtin
    # id_list is built from the raw column and may therefore contain NaN;
    # a groupby with pandas' default dropna=True has no group for NaN keys
    # and get_group() would raise KeyError.
    if pd.isna(assg_id):
        continue

    group = US_family_assg.get_group(assg_id)
    # Summed share per candidate, keeping only the two largest.
    score = group[['kiprisid', 'rs']].groupby('kiprisid').sum().nlargest(2, 'rs')

    if score.empty:
        # Every kiprisid in this group is missing, so there is no candidate
        # (indexing position 0 here used to raise IndexError).
        continue

    candidates = score.index.values
    shares = score['rs'].values

    # NOTE(review): ties are detected with exact float equality, as before.
    # Sums that differ only by rounding are treated as distinct; switch to a
    # tolerance-based comparison if that turns out to matter.
    if len(score) > 1 and shares[0] == shares[1]:
        continue

    match[assg_id] = [candidates[0], shares[0]]


# Notebook peek: number of unambiguous matches.
len(match)


# In[22]:

# Turn the match dict into a table: one row per key, the two list entries
# become 'kiprisid' and 'relative_score', and the key itself is copied from the
# index into an 'assgid' column so it can be used as a join key.
temp = pd.DataFrame.from_dict(
    match,
    orient='index',
    columns=['kiprisid', 'relative_score'],
)
temp = temp.assign(assgid=temp.index)

# The dict, the grouped frame and the id list are no longer needed.
del match
del US_family_assg
del id_list

# Notebook peek: matches whose winning share is below 0.4.
temp.loc[temp['relative_score'] < 0.4]


# In[23]:

# Load the four Stata lookup tables used to enrich the matches.
usstd, dg_match, dgstd, kipstd = map(
    pd.read_stata,
    (
        'USPTO_KR_assg.dta',
        'all_matches.dta',
        'dg_namstand.dta',
        'kipris_namstand.dta',
    ),
)


# In[24]:

# Column selections kept after each enrichment join.
match_cols = ['kiprisid', 'relative_score', 'assgid', 'standard_name', 'stem_name', 'Symbol']
dg_cols = match_cols + ['dgstd', 'dgstem']
final_cols = [
    'assgid', 'standard_name', 'stem_name', 'relative_score',
    'kiprisid', 'kipstd', 'kipstem', 'Symbol', 'dgstd', 'dgstem',
]

# Inner join on assgid keeps only matches that have a row in usstd; the
# remaining joins are left joins, so rows without a partner stay with NaN.
temp = temp.merge(usstd, how='inner', on='assgid')
temp = temp.merge(dg_match, how='left', on='kiprisid')[match_cols]
temp = temp.merge(dgstd, how='left', on='Symbol')[dg_cols]
result = temp.merge(kipstd, how='left', on='kiprisid')[final_cols]

# Show the first 20 rows followed by the total row count.
print(result.head(20), len(result), sep='\n')


# In[25]:

# Write the final table (index included) using '*' as the field separator.
result.to_csv(path_or_buf='family_match.csv', sep='*')


# In[ ]:



